/******************
Filename: rhtp_analyses_jama.sas
Written by: Eliza Macneal
Update Date: February 13, 2026

Description:
This file produces a dataset reporting the following estimates at the state level:
- Rural population, 2020, based on HRSA-designated rural census tracts
- N rural hospital beds, 2017 & 2022
- % change in N rural hospital beds, 2017-2022
- N rural physicians per 100,000 rural residents, 2017 & 2022
- % change in N rural physicians per 100,000 rural residents, 2017-2022
- Age-adjusted rural all-cause mortality per 100,000 population, averaged 2019-2023

Sources:
Rural census tracts and counties: HRSA
Population (census tract level): 2020 Census
N hospital beds: AHRF 2018-2019 & 2023-2024
N physicians: AHRF 2018-2019 & 2023-2024
Age-adjusted all-cause mortality per 100,000: HDPulse

Notes:
- N rural hospital beds and N rural physicians are calculated by weighting county-level counts by the rural proportion
	of the county. For instance, a county with 1000 hospital beds and a 50% rural population contributes 500 hospital beds
	towards the state-level count of rural hospital beds. The rural proportion of the county is calculated based on 2020 Census
	census-tract populations and HRSA-designated 2020 rural census tracts.
- Age-adjusted rural mortality is calculated by limiting to HRSA-designated fully rural counties and weighting by county
	population as a proportion of total state population in fully rural counties. For privacy reasons, mortality is not
	publicly reported at the census-tract level.
- HRSA's county-level dataset reports full rurality in Connecticut for its nine county-equivalent planning regions, rather than its eight
	counties. HDPulse reports age-adjusted mortality in Connecticut data for its counties rather than its planning regions.
	Since 100% of census tracts in Windham county are HRSA-designated as rural, we count Windham as Connecticut's one fully rural county.
	See map of Connecticut planning regions vs counties: https://img.federalregister.gov/EN06JN22.001/EN06JN22.001_original_size.png
	Note that Litchfield county, corresponding approximately to Northwest Hills planning region, contains non-rural census tracts and thus
	is not counted as a fully rural county.

Prerequisites:
'<insert correct path>/SAS code/AHRF2018-19.sas' (From AHRF 2018-2019 data download. Reads in ASC file of AHRF 2019 data and outputs SAS dataset)

Libraries:
rht='<insert correct path>/SAS datasets/Analytical'
rht_src='<insert correct path>/SAS datasets/Source'

Input:
rht_src.hrsa_rural_counties
rht_src.hrsa_rural_census_tracts
rht_src.census_trct_pop_2020
rht_src.ahrf2019 (produced by 'AHRF2018-19.sas')
rht_src.ahrf2024_feb2025
rht_src.county_mortality

Output:
rht.state_estimates

******************/

********************************************************************************************************;
* DECLARE LIBRARIES																						;
********************************************************************************************************;

libname rht '<insert correct path>/SAS datasets/Analytical';
libname rht_src '<insert correct path>/SAS datasets/Source';

********************************************************************************************************;
* IMPORT DATA																							;
********************************************************************************************************;

* Import HRSA rural county file;
proc import datafile= '<insert correct path>/Data sources/rural-health-areas-data-set.xlsx'
out= rht_src.hrsa_rural_counties
dbms=xlsx
replace;
sheet="County_Eligibility";
getnames=yes;
run;

* Import HRSA rural census tract file;
proc import datafile= '<insert correct path>/Data sources/rural-health-areas-data-set.xlsx'
out= rht_src.hrsa_rural_census_tracts
dbms=xlsx
replace;
sheet="Census_Tract_Eligibility";
getnames=yes;
run;

* Import 2020 Census census tract population data;
options nosource validvarname=v7;
data rht_src.census_trct_pop_2020;
   infile  '<insert correct path>/Data sources/2020 Census population (census tract)/R50022909_SL140.csv' 
		firstobs=3 dsd truncover delimiter=',' lrecl=400000;
   label 
         QNAME='Qualifying Name'
         NAME='Name of Area'
         FIPS='FIPS'
         NATION='Nation'
         STATE='State'
         COUNTY='County'
         CS='County Subdivision'
         CT='Census Tract'
         BG='Block Group'
         PLACE='Place'
         CD='Congressional District (116th)'
         SLDUC='State Legislative District (Upper Chamber)'
         SLDLC='State Legislative District (Lower Chamber)'
         MSAMSA='Metropolitan Statistical Area/Micropolitan Statistical Area'
         CSA='Combined Statistical Area'
         V1T003001='2020 Total Population'
         V1T004001='2010 Total Population'
         V1T005001='Population Change'
         V1T006001='Percent Change Population'
         ;
length QNAME $60 NAME $30 FIPS $11;
input QNAME $ NAME $ FIPS $ NATION $ STATE $ COUNTY $ CS $ CT $ BG $ PLACE $ CD $ SLDUC $ SLDLC $ MSAMSA $ CSA $
	V1T003001 V1T004001 V1T005001 V1T006001;
run;

* Import county level mortality rates;
proc import datafile= "<insert correct path>/Data sources/HDPulse_county_age_adj_mort_2019_2023.csv"
out= rht_src.county_mortality
dbms=csv
replace;
getnames=yes;
run;

* Create dataset with relevant AHRF variables from 2017 and 2022;
* In Connecticut in 2022, pull in county-level population estimate from LAPI (ACS 2022 only reports planning region population);
data rht.ahrf;
	merge rht_src.ahrf2019
		(rename=(f0892117=hosp_beds_17 f1212917=md_nf_fed_activ_17
			f00002=fips_st_cnty f00005=date_file f00008=st_name f12424=st_name_abbrev f00010=cnty_name f1198417=popn_est_17))
		rht_src.ahrf2024_feb2025;
	by fips_st_cnty;
	rename fips_st_cnty=fips;
	popn_est_22=coalesce(popn_est_22, popn_22);
	keep fips_st_cnty st_name st_name_abbrev cnty_name hosp_beds_17 hosp_beds_22 md_nf_fed_activ_17 md_nf_fed_activ_22 popn_est_17 popn_est_22;
run;

********************************************************************************************************;
* GET 2020 RURAL POPULATION BY STATE																	;
********************************************************************************************************;

* Check whether Census Tract codes are correctly formatted in both datasets;
proc freq data=rht_src.census_trct_pop_2020;
	tables qname*fips;
	where fips ne cats(state,county,ct);
run;
proc freq data=rht_src.hrsa_rural_census_tracts;
	tables state*county_name_2023*tract_name_2020*ct_fips_2020 / list;
	where (ct_fips_2020 ne cats(state_fips,county_2023,tract_2020));
run;

* Fix formatting error in Bullock County in Alabama;
data rht_src.hrsa_rural_census_tracts;
	set rht_src.hrsa_rural_census_tracts;
	if ct_fips_2020="1011952202" then ct_fips_2020="01011952202";
run;

* Sum population over rural census tracts by state;
proc sql;
	create table rht.state_estimates as
	select distinct
		fipstate(input(a.state,best12.)) as state,
		sum(V1T003001) as rural_pop
	from rht_src.census_trct_pop_2020 a inner join rht_src.hrsa_rural_census_tracts(where=(forhp_rural="Yes")) b
	on a.fips=b.ct_fips_2020
	group by a.state
	having state ne "PR"; /* Exclude Puerto Rico */
quit;

********************************************************************************************************;
* GET AGE ADJUSTED RURAL MORTALITY BY STATE																;
********************************************************************************************************;

* Limit to HRSA-designated fully rural counties;
proc sql;
	create table rht.rural_county_mortality as
	select distinct fipstate(input(substr(a.fips,1,2),best12.)) as state,
		a.fips, a.All_Causes_of_Death_Rates__Table as county_name, input(a.var3,best12.) as adj_mort
	from rht_src.county_mortality(rename=(var2=fips) where=(not missing(fips) and input(substr(fips,3,3),best12.) in (1:999))) a
	left join rht_src.hrsa_rural_counties b
	on a.fips=b.fips_2023
	where b.county_eligibility="Fully FORHP Rural"
	/* Retain Windham countiy in Connecticut as a fully rural county (see note in document header) */
	or (substr(fips,1,2)="09" and county_name="Windham County")
	order by state;
quit;

* Merge in county population;
proc sql;
	create table rht.rural_county_mortality as
	select distinct a.*, sum(b.V1T003001) as pop_2020
	from rht.rural_county_mortality a left join rht_src.census_trct_pop_2020 b
	on a.fips=substr(b.fips,1,5)
	group by a.fips;
quit;

* Get state-level population-weighted age-adjusted mortality rate among fully rural counties;
proc sql;
	create table rht.state_estimates as
	select distinct a.*, sum(adj_mort*b.pop_2020)/sum(b.pop_2020) as adj_rural_mort
	from rht.state_estimates a left join rht.rural_county_mortality b
	on a.state=b.state
	group by a.state;
quit;

********************************************************************************************************;
* GET N AND PERCENT CHANGE IN RURAL HOSPITAL BEDS AND PHYSICIANS PER CAPITA, 2017-2022					;
********************************************************************************************************;

* Get rural proportion of county population in 2020;
proc sql;
	create table rht.county_rural_proportion as
	select distinct substr(a.fips,1,5) as fips,
		sum(case when b.forhp_rural="Yes" then V1T003001 else 0 end)/sum(V1T003001) as rural_pop_ratio
	from rht_src.census_trct_pop_2020 a left join rht_src.hrsa_rural_census_tracts b
	on a.fips=b.ct_fips_2020
	group by substr(a.fips,1,5);
quit;

* Merge rural proportion of county population to AHRF;
proc sql;
	create table rht.ahrf as
	select distinct a.*, b.rural_pop_ratio
	from rht.ahrf a left join rht.county_rural_proportion b
	on a.fips=b.fips;
quit;

* Weight N beds, N physicians, and total population by rural population ratio to estimate rural counts;
data rht.ahrf;
	set rht.ahrf;
	n_rural_hosp_beds_2017=hosp_beds_17*rural_pop_ratio;
	n_rural_hosp_beds_2022=hosp_beds_22*rural_pop_ratio;
	n_rural_phys_2017=md_nf_fed_activ_17*rural_pop_ratio;
	n_rural_phys_2022=md_nf_fed_activ_22*rural_pop_ratio;
	rural_population_2017=popn_est_17*rural_pop_ratio;
	rural_population_2022=popn_est_22*rural_pop_ratio;
run;

* Get N rural hospital beds and N rural physicians in 2017 and 2022 and % change across 2017-2022 by state;
proc sql;
	create table rht.state_estimates as 
	select distinct a.*,
		sum(b.n_rural_hosp_beds_2017) as n_rural_hosp_beds_2017, /* N rural hospital beds in 2017 */
		sum(b.n_rural_hosp_beds_2022) as n_rural_hosp_beds_2022, /* N rural hospital beds in 2022 */
		100000*sum(b.n_rural_phys_2017)/sum(b.rural_population_2017) as rural_phys_per_100k_2017, /* N rural physicians in 2017 */
		100000*sum(b.n_rural_phys_2022)/sum(b.rural_population_2022) as rural_phys_per_100k_2022 /* N rural physicians in 2022 */
	from rht.state_estimates a left join rht.ahrf b
	on a.state=b.st_name_abbrev
	group by a.state;
quit;

* Get % changes;
data rht.state_estimates;
	set rht.state_estimates;
	pct_change_rural_hosp_beds_17_22=100*(n_rural_hosp_beds_2022-n_rural_hosp_beds_2017)/n_rural_hosp_beds_2017;
	pct_ch_rural_phys_p100k_17_22=100*(rural_phys_per_100k_2022-rural_phys_per_100k_2017)/rural_phys_per_100k_2017;
run;

********************************************************************************************************;
* EXPORT RESULTS																						;
********************************************************************************************************;

proc sort data=rht.state_estimates; by state; run;
proc export data= rht.state_estimates
outfile= "<insert correct path>/Excel output/state_estimates.xlsx"
dbms=excel replace;
sheet="state_estimates";
run;
